/*******************************************************************************
																				
	DESCRIPTION: 	This do file creates the datasets that are to be fed into R 
					to create the ensemble predictions for the models with 
					the expanded employment history variables.
									
*******************************************************************************/

clear all
global id_code 002

************************************************************************
* 1. Preparation to split the dataset
************************************************************************

* Load the main dataset and discard the occupation variables.
use "${data}/001_9_FinalMainDataset.dta", clear
drop L_Occupation*

* Put the observations in a reproducible random order so that the training
* sample is a random sample. The sort key lives in a temporary variable so it
* cannot clash with an existing variable name.
set seed 2111
tempvar shuffleKey
generate double `shuffleKey' = runiform()
isid `shuffleKey' // Ensure the key gives a unique ordering of the data
sort `shuffleKey'
drop `shuffleKey'

* Record the shuffled position. It is stored as long because the default
* storage type (float) cannot represent integers above 16,777,216 exactly,
* which would create ties in n and make later sorts on n ambiguous.
generate long n = _n

* Replace each non-ordinal categorical variable with one indicator per level.
local nominalVars L_Industry_3digit L_civilStatus EducLevel L_Municipality ///
	L_Age_Youngest citizenship migrationCohort
foreach catVar of local nominalVars {
	tabulate `catVar', generate(`catVar'_dummy)
	drop `catVar'
}

************************************************************************
* 2. Define a new macro that includes all employment history related variables
************************************************************************

* Names are listed one family per line; the order of the names is unchanged.
global emplHist_expanded ///
	L_emplStatu ///
	nEmployers1Y nEmployers2Y nEmployers3Y nEmployers4Y nEmployers5Y ///
	tenure L_nEmployees_L1L2 L_firmSizeChange_L1L2 L_layoffRate_L1L2 ///
	L_emplStatu_Missing ///
	nEmployers1Y_Missing nEmployers2Y_Missing nEmployers3Y_Missing ///
	nEmployers4Y_Missing nEmployers5Y_Missing ///
	tenure_Missing L_nEmployees_L1L2Missing L_firmSizeChange_L1L2Missing ///
	L_layoffRate_L1L2Missing ///
	unemplSpells1Ybefore unemplSpells2Ybefore unemplSpells3Ybefore ///
	unemplSpells4Ybefore unemplSpells5Ybefore ///
	DaysOnDI_1Years DaysOnDI_2Years DaysOnDI_3Years DaysOnDI_4Years ///
	DaysOnDI_5Years ///
	DaysUnemp_1Years DaysUnemp_2Years DaysUnemp_3Years DaysUnemp_4Years ///
	DaysUnemp_5Years ///
	DaysOnDI_1Years_Miss DaysOnDI_2Years_Miss DaysOnDI_3Years_Miss ///
	DaysOnDI_4Years_Miss DaysOnDI_5Years_Miss ///
	DaysUnemp_1Years_Miss DaysUnemp_2Years_Miss DaysUnemp_3Years_Miss ///
	DaysUnemp_4Years_Miss DaysUnemp_5Years_Miss ///
	DaysOnDI_1YearsMissing DaysOnDI_2YearsMissing DaysOnDI_3YearsMissing ///
	DaysOnDI_4YearsMissing DaysOnDI_5YearsMissing ///
	DaysUnemp_1YearsMissing DaysUnemp_2YearsMissing DaysUnemp_3YearsMissing ///
	DaysUnemp_4YearsMissing DaysUnemp_5YearsMissing

************************************************************************
* 3. Sample preparation
************************************************************************
pause off

* Define the sample, the year span and the set of variables. The variable set
* is reduced to the "basic" model variables plus the employment history
* variables.
local sampleName Full
local yearSpan 2006/2006
local varSet "$demoEdu $emplHist_expanded"

* One output file is produced per calendar year in the year span.
forvalues y = `yearSpan' {

	preserve

	* First and last day of calendar year `y' as Stata dates.
	local yrFirst = d(01jan`y')
	local yrLast  = d(31dec`y')

	* Flag for observations with at least one 3-month outcome inside the year.
	tempvar inYear
	generate `inYear' = 0

	* Blank out outcomes whose reference date (startU plus the lag, counted as
	* 30 days per month) falls outside the year. Only lags of 0, 3, 6, 9 and
	* 12 months are handled; months 1, 2, 4, 5, etc. are not.
	forvalues lag = 0(3)12 {

		foreach horizon in 3M 6M {
			replace emplAft`horizon'_`lag'M_In = . ///
				if !inrange(startU + `lag'*30, `yrFirst', `yrLast')
		}

		replace `inYear' = 1 if emplAft3M_`lag'M_In != .
	}

	* The 1-month and 12-month outcomes exist only for lag 0.
	foreach horizon in 1M 12M {
		replace emplAft`horizon'_0M_In = . ///
			if !inrange(startU, `yrFirst', `yrLast')
	}

	* Discard observations without any 3-month outcome in the year.
	drop if `inYear' == 0
	drop `inYear'

	************************************************************************
	* Keep only observations and variables that belong to the sample
	************************************************************************
	* Sample membership is indicated by inSample_`sampleName', generated in
	* code 001_09.
	keep if inSample_`sampleName' == 1
	pause

	* Keep only the identifiers, the model variables and the outcomes.
	keep LopNr_PersonNr InLnr `varSet' inSample_`sampleName' emplAft* n
	pause

	************************************************************************
	* Prepare and save the sample
	************************************************************************

	* Generate the variable used to split the dataset into parameter tuning,
	* training and prediction partitions. It is stored as long because the
	* default float type cannot represent integers above 16,777,216 exactly.
	sort n
	generate long n_order = _n

	rename inSample_`sampleName' inSample

	* The last variables should be the covariates, starting from Gender.
	* This is used to tell R which variables should be included in the model.
	order LopNr_PersonNr InLnr n n_order inSample emplAft* Gender, first

	compress

	save "${data}/${id_code}_DataForR_BasicWithEmplHist_`y'.dta", replace

	restore

}

************************************************************************
* 4. Creating sub-models for different sets of employment history variables
*	 by cumulatively adding them
************************************************************************

* Variable groups that are dropped to create the sub-models. Each global holds
* a list of variable-name patterns; "All" is deliberately empty (nothing is
* dropped for the full model).
global All ""

global Yminus5 tenure* L_nEmployees_L1L2* ///
	L_firmSizeChange_L1L2* L_layoffRate_L1L2*

global Yminus4 nEmployers5Y* unemplSpells5Ybefore ///
	DaysOnDI_5Years* DaysUnemp_5Years*

global Yminus3 nEmployers4Y* unemplSpells4Ybefore ///
	DaysOnDI_4Years* DaysUnemp_4Years*

global Yminus2 nEmployers3Y* unemplSpells3Ybefore ///
	DaysOnDI_3Years* DaysUnemp_3Years*

global Yminus1 nEmployers2Y* unemplSpells2Ybefore ///
	DaysOnDI_2Years* DaysUnemp_2Years*

global Basic nEmployers1Y* unemplSpells1Ybefore ///
	DaysOnDI_1Years* DaysUnemp_1Years* L_emplStatu*
	
	
* Recode the variables so that year Y variables include only year Y information
* (e.g., days on UI 5 years ago, not days on UI up to 5 years ago).
* The loop runs from year 5 down to year 2 so that the year Y-1 variable that
* is subtracted has not itself been recoded yet.
* "clear" is the documented option of -use- for discarding the data in memory.
use "${data}/${id_code}_DataForR_BasicWithEmplHist_2006.dta", clear
forvalues y = 5(-1)2 {
	local prev = `y' - 1

	replace nEmployers`y'Y = nEmployers`y'Y - nEmployers`prev'Y
	replace unemplSpells`y'Ybefore = unemplSpells`y'Ybefore - unemplSpells`prev'Ybefore
	replace DaysOnDI_`y'Years = DaysOnDI_`y'Years - DaysOnDI_`prev'Years
	replace DaysUnemp_`y'Years = DaysUnemp_`y'Years - DaysUnemp_`prev'Years

	replace DaysOnDI_`y'Years_Miss = DaysOnDI_`y'Years_Miss - DaysOnDI_`prev'Years_Miss
	replace DaysUnemp_`y'Years_Miss = DaysUnemp_`y'Years_Miss - DaysUnemp_`prev'Years_Miss
}

* Save cumulative submodels:
* The groups are dropped one after another without restoring in between, so
* each saved file contains fewer variables than the previous one.
preserve
foreach varDrop in All Yminus5 Yminus4 Yminus3 Yminus2 Yminus1 Basic {

	* "All" expands to an empty list, for which nothing is dropped. Any other
	* failure of -drop- now stops the do-file instead of silently saving an
	* unchanged dataset under a submodel name.
	local dropList ${`varDrop'}
	if "`dropList'" != "" {
		drop `dropList'
	}

	save "${data}/${id_code}_DataForR_BasicWithEmplHist_`varDrop'_2006", replace
}
restore

* Save marginal submodels:
* Each Marg_* global lists every variable group except one, so dropping it
* leaves exactly one group of employment history variables in the data.
global Marg_Yminus1 $Yminus5 $Yminus4 $Yminus3 $Yminus2 $Yminus1
global Marg_Yminus2 $Yminus5 $Yminus4 $Yminus3 $Yminus2 $Basic
global Marg_Yminus3 $Yminus5 $Yminus4 $Yminus3 $Yminus1 $Basic
global Marg_Yminus4 $Yminus5 $Yminus4 $Yminus2 $Yminus1 $Basic
global Marg_Yminus5 $Yminus5 $Yminus3 $Yminus2 $Yminus1 $Basic
global Marg_PreUnemp $Yminus4 $Yminus3 $Yminus2 $Yminus1 $Basic

foreach varDrop in Marg_PreUnemp Marg_Yminus5 Marg_Yminus4 Marg_Yminus3 Marg_Yminus2 Marg_Yminus1 {
	preserve

	* None of these lists is empty, so -drop- is not wrapped in -capture-:
	* a failure here must stop the do-file rather than save the full dataset
	* under a submodel name.
	drop ${`varDrop'}

	save "${data}/${id_code}_DataForR_BasicWithEmplHist_`varDrop'_2006", replace
	restore
}


************************************************************************
* 5. Creating sub-models for different employment history variables
*	 by cumulatively adding them (INDIVIDUAL VARIABLES)
************************************************************************

* Create variable group with all the missings:
global missings DaysOnDI_2Years_Miss DaysOnDI_2YearsMissing DaysUnemp_2Years_Miss ///
		DaysUnemp_2YearsMissing nEmployers2Y_Missing tenure_Missing ///
		L_nEmployees_L1L2Missing L_firmSizeChange_L1L2Missing L_layoffRate_L1L2Missing

* Import data again ("clear" is the documented option of -use- for discarding
* the data in memory).
* NOTE(review): this reloads the file saved in section 3, so the recoding
* applied in memory in section 4 is not present here - confirm this is intended.
use "${data}/${id_code}_DataForR_BasicWithEmplHist_2006.dta", clear

* Keep only Y-2 and pre-unemployment variables, i.e. of the employment history
* groups only those listed in $Yminus1 and $Yminus5 remain:
drop $Yminus4 $Yminus3 $Yminus2 $Basic

* Save full submodel:
save "${data}/${id_code}_DataForR_BasicWithEmplHist_IndivVars_Seq_2006", replace

* Save sequential submodels:
* The items are dropped one after another without restoring in between, and
* the data are saved after every step. The item "missings" stands for the whole
* $missings group; every other item is a single variable.
preserve
foreach step in  missings L_layoffRate_L1L2 L_firmSizeChange_L1L2 ///
	L_nEmployees_L1L2 tenure DaysOnDI_2Years nEmployers2Y ///
	unemplSpells2Ybefore DaysUnemp_2Years  {

	* Resolve the item into the list of variables to remove at this step.
	local toDrop `step'
	if "`step'" == "missings" {
		local toDrop $missings
	}

	drop `toDrop'

	save "${data}/${id_code}_DataForR_BasicWithEmplHist_IndivVars_Seq_`step'_2006", replace
}
restore

* Save marginal submodels:
* For each item, drop every variable matched by $Yminus1 and $Yminus5 except
* the item itself, then save. The item "missings" stands for the whole
* $missings group; every other item is a single variable.
foreach varDrop in  missings L_layoffRate_L1L2 L_firmSizeChange_L1L2 ///
	L_nEmployees_L1L2 tenure DaysOnDI_2Years nEmployers2Y ///
	unemplSpells2Ybefore DaysUnemp_2Years  {

	preserve

	* Variables that must survive the drop below.
	local keepVars `varDrop'
	if "`varDrop'" == "missings" {
		local keepVars $missings
	}

	* Park a copy of each kept variable under a name the drop patterns do not
	* match. -clonevar- is used instead of -generate- so that the copy keeps
	* the storage type, labels and display format of the original; -generate-
	* would create a float and round double or long values.
	foreach v of local keepVars {
		clonevar t_`v' = `v'
	}

	drop $Yminus1 $Yminus5

	* Give the copies their original names back.
	foreach v of local keepVars {
		rename t_`v' `v'
	}

	save "${data}/${id_code}_DataForR_BasicWithEmplHist_IndivVars_Marg_`varDrop'_2006", replace
	restore
}

